import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
import warnings
warnings.filterwarnings('ignore')
# data wrangling for the original data source
df = pd.read_csv("online_retail_listing.csv", delimiter = ';', encoding= 'unicode_escape')
df.dropna(how='any', inplace=True)
df.rename(columns= {'Price':'Unit Price', 'Country,,,,':'Country', 'InvoiceDate':'Order Date'}, inplace=True)
df['Unit Price']=df['Unit Price'].str.replace(',','.')
df['Unit Price']=df['Unit Price'].astype('float64')
df['Quantity']=df['Quantity'].astype('int64')
df['Order Date']=df['Order Date'].astype('datetime64')
df['Today']= '2012-02-13 12:00:00'
df['Today']= df['Today'].astype('datetime64')
df['Monetary Value'] = df['Unit Price']*df['Unit Price']
df.drop_duplicates(inplace=True)
df.columns
Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'Order Date',
'Unit Price', 'Customer ID', 'Country', 'Today', 'Monetary Value'],
dtype='object')
# integrate a dataset ready for calculation
rfm=df[['Customer ID']]
rfm['First Order Date'] = df.groupby(['Customer ID'])['Order Date'].transform('min')
rfm['Last Order Date'] = df.groupby(['Customer ID'])['Order Date'].transform('max')
rfm['Interval'] = df['Today'] - rfm['Last Order Date']
rfm['Interval'] = rfm['Interval'].astype('timedelta64[D]')
rfm['Interval'] = rfm['Interval'].astype('int')
rfm['Lifetime'] = abs(rfm['First Order Date'] - rfm['Last Order Date'])
rfm['Lifetime'] = rfm['Lifetime'].astype('timedelta64[D]')
rfm['Lifetime'] = rfm['Lifetime'].astype('int')
rfm['Frequency'] = df.groupby(['Customer ID'])['Order Date'].transform('count')
rfm['Total Spent'] = df.groupby(['Customer ID'])['Monetary Value'].transform('sum').round(2)
rfm.drop_duplicates(inplace=True)
rfm.reset_index(inplace=True)
rfm.drop(['index'], axis=1, inplace=True)
rfm.sample(5)
| Customer ID | First Order Date | Last Order Date | Interval | Lifetime | Frequency | Total Spent | |
|---|---|---|---|---|---|---|---|
| 2035 | 14651.0 | 2010-03-18 11:39:00 | 2011-12-05 16:29:00 | 69 | 627 | 283 | 9583.83 |
| 689 | 17603.0 | 2009-10-12 09:44:00 | 2011-10-20 19:19:00 | 115 | 738 | 115 | 602030.72 |
| 5656 | 17832.0 | 2011-10-21 11:43:00 | 2011-10-21 11:43:00 | 115 | 0 | 60 | 179.03 |
| 4887 | 15952.0 | 2011-02-10 12:47:00 | 2011-06-11 11:18:00 | 247 | 120 | 84 | 1497.36 |
| 416 | 16104.0 | 2009-06-12 10:41:00 | 2011-09-10 10:23:00 | 156 | 819 | 241 | 5546.57 |
rfm["FrequencyScore"] = pd.cut(rfm["Frequency"],
bins=[-1,
np.percentile(rfm["Frequency"], 25),
np.percentile(rfm["Frequency"], 50),
np.percentile(rfm["Frequency"], 75),
rfm["Frequency"].max()],
labels=[1, 2, 3, 4]).astype("int")
rfm["MonetaryScore"] = pd.cut(rfm["Total Spent"],
bins=[-1,
np.percentile(rfm["Total Spent"], 25),
np.percentile(rfm["Total Spent"], 50),
np.percentile(rfm["Total Spent"], 75),
rfm["Total Spent"].max()],
labels=[1, 2, 3, 4]).astype("int")
rfm["RecencyScore"] = pd.cut(rfm["Interval"],
bins=[-1,
np.percentile(rfm["Interval"], 25),
np.percentile(rfm["Interval"], 50),
np.percentile(rfm["Interval"], 75),
rfm["Interval"].max()],
labels=[4, 3, 2, 1]).astype("int")
# calculate RFM scores
rfm['RFM Score'] = rfm["FrequencyScore"] + rfm["MonetaryScore"] + rfm["RecencyScore"]
# segmentize customer group based on RFM scores
rfm["Loyalty"] = pd.cut(rfm["RFM Score"],
bins=[-1,
np.percentile(rfm["RFM Score"], 33),
np.percentile(rfm["RFM Score"], 66),
rfm["RFM Score"].max()],
labels=['Silver', 'Gold', 'Platinum']).astype("str")
rfm
| Customer ID | First Order Date | Last Order Date | Interval | Lifetime | Frequency | Total Spent | FrequencyScore | MonetaryScore | RecencyScore | RFM Score | Loyalty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 2009-01-12 07:45:00 | 2011-05-07 12:11:00 | 281 | 845 | 88 | 690628.40 | 3 | 4 | 2 | 9 | Gold |
| 1 | 13078.0 | 2009-01-12 09:06:00 | 2011-12-10 14:48:00 | 64 | 1062 | 821 | 19615.11 | 4 | 4 | 4 | 12 | Platinum |
| 2 | 15362.0 | 2009-01-12 09:08:00 | 2010-09-17 10:37:00 | 514 | 613 | 40 | 840.92 | 2 | 2 | 1 | 5 | Silver |
| 3 | 18102.0 | 2009-01-12 09:24:00 | 2011-11-28 12:55:00 | 76 | 1050 | 1027 | 22629772.57 | 4 | 4 | 4 | 12 | Platinum |
| 4 | 12682.0 | 2009-01-12 09:28:00 | 2011-12-08 15:22:00 | 66 | 1060 | 962 | 28539.73 | 4 | 4 | 4 | 12 | Platinum |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5918 | 12587.0 | 2011-02-12 12:47:00 | 2011-02-12 12:47:00 | 365 | 0 | 4 | 1612.98 | 1 | 3 | 2 | 6 | Silver |
| 5919 | 13560.0 | 2011-02-12 13:11:00 | 2011-02-12 13:11:00 | 365 | 0 | 32 | 358.05 | 2 | 2 | 2 | 6 | Silver |
| 5920 | 15539.0 | 2011-02-12 13:36:00 | 2011-02-12 13:36:00 | 365 | 0 | 41 | 101.37 | 2 | 1 | 2 | 5 | Silver |
| 5921 | 13017.0 | 2011-02-12 15:53:00 | 2011-02-12 15:53:00 | 365 | 0 | 1 | 18.06 | 1 | 1 | 2 | 4 | Silver |
| 5922 | 13153.0 | 2011-04-12 12:14:00 | 2011-04-12 12:14:00 | 306 | 0 | 15 | 42.29 | 1 | 1 | 2 | 4 | Silver |
5923 rows × 12 columns
# discover if there is a correlation between lifetime and customers' RFM scores
fig, ax = plt.subplots(figsize=(5, 5), constrained_layout=True)
ax.axis('on')
ax.bar(rfm['RFM Score'], rfm.groupby(['RFM Score'])['Lifetime'].transform('mean'))
ax.set_title('Average Lifetime for Each RFM Segments',
fontdict={'fontsize': '17', 'fontweight' : '3'})
plt.show()
# visualization of current customer base composition
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.countplot(x="Loyalty", data=rfm, color='steelblue')
ax.bar_label(ax.containers[0])
[Text(0, 0, '1815'), Text(0, 0, '1697'), Text(0, 0, '2411')]
# Monetary vs Recency (combined)
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.scatterplot(x='Interval', y='Total Spent', data=rfm, hue='Loyalty', size='Total Spent', palette=["gold", "green", "red"])
ax.set_xlim(1, 800)
ax.set_ylim(1, 4000)
# Frequency vs Recency (combined)
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.scatterplot(x='Interval', y='Frequency', data=rfm, hue='Loyalty', size='Total Spent', palette=["gold", "green", "red"])
ax.set_xlim(1, 800)
ax.set_ylim(1, 2000)
# Monetary vs Frequency (combined)
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.scatterplot(x='Total Spent', y='Frequency', data=rfm, hue='Loyalty', size='Total Spent', palette=["gold", "green", "red"])
ax.set_xlim(1, 4000)
ax.set_ylim(1, 1000)
(1.0, 1000.0)
import plotly.express as px
# Monetary vs Recency (combined)
fig1 = px.scatter(rfm, x='Interval', y='Total Spent', color='Loyalty')
fig1.update_layout(yaxis_range=[1,4000], xaxis_range=[1,800])
# Frequency vs Recency (combined)
fig2 = px.scatter(rfm, x='Interval', y='Frequency', color='Loyalty')
fig2.update_layout(yaxis_range=[1,800], xaxis_range=[1,800])
# Monetary vs Frequency (combined)
fig3 = px.scatter(rfm, x='Total Spent', y='Frequency', color='Loyalty')
fig3.update_layout(yaxis_range=[1,500], xaxis_range=[1,4000])